import os
import time

import requests
from bs4 import BeautifulSoup

def gain_repeat(url, headers):
    """Fetch *url*, retrying forever until a response is obtained.

    Parameters
    ----------
    url : str
        Target URL.
    headers : dict
        HTTP request headers to send.

    Returns
    -------
    requests.Response
        The response of the first successful request.
    """
    while True:
        try:
            # timeout keeps a hung connection from blocking the retry loop forever
            return requests.get(url=url, headers=headers, timeout=30)
        except requests.exceptions.RequestException:
            # Narrowed from a bare ``except`` so KeyboardInterrupt/SystemExit
            # can still terminate the program instead of being swallowed here.
            print('远程主机强制关闭了一个现有的链接，正在重新连接……')
            time.sleep(2)

def down_pic(url, header):
    """Download every image on the gallery page at *url* into ``.\\已爬取的图片``.

    Parameters
    ----------
    url : str
        Gallery page URL.
    header : dict
        HTTP request headers to send.
    """
    html = gain_repeat(url=url, headers=header)
    html.encoding = 'utf-8'
    soup = BeautifulSoup(html.text, 'lxml')
    # Gallery title used as the file-name prefix; the second
    # ``href="javascript:;"`` anchor is assumed to hold it (site-specific).
    name_image = soup.find_all('a', href='javascript:;')[1].text
    # Create the output directory up front instead of crashing in open()
    # with FileNotFoundError on a fresh checkout.
    save_dir = '.\\已爬取的图片'
    os.makedirs(save_dir, exist_ok=True)
    n = 0
    for i in soup.find_all('img'):
        # Some <img> tags (logos, icons) lack the lazy-load attribute;
        # skip them instead of raising KeyError.
        src = i.get('data-original')
        if not src:
            continue
        n = n + 1
        image = gain_repeat(url=src, headers=header)
        address_file = save_dir + '\\' + name_image + '（' + str(n) + '）.jpg'
        with open(address_file, 'wb') as f:
            f.write(image.content)
        print('成功爬取第' + str(n) + '张')
    print('爬取完毕:' + name_image)
    time.sleep(0.5)

def down_list(station_url, list_url_pic, header, pages):
    """Collect gallery-page URLs by walking up to *pages* paginated list pages.

    Parameters
    ----------
    station_url : str
        Site root, prepended to the relative hrefs found on each page.
    list_url_pic : str
        URL of the first list page.
    header : dict
        HTTP request headers to send.
    pages : int
        Maximum number of list pages to walk.

    Returns
    -------
    list[str]
        Absolute URLs of every gallery linked from the visited pages.
    """
    a_url = list_url_pic
    url_list = []
    # ``_`` instead of ``i``: the original outer ``i`` was shadowed by the
    # inner anchor loop, which was confusing even though the value was unused.
    for _ in range(pages):
        html = gain_repeat(url=a_url, headers=header)
        html.encoding = 'utf-8'
        soup = BeautifulSoup(html.text, 'lxml')
        content = soup.find('div', id='tpl-img-content')
        if content is None:
            # Layout changed or an error page came back — stop cleanly
            # instead of raising AttributeError.
            break
        for link in content.find_all('a'):
            url_list.append(station_url + link['href'])
        # The "下一页" (next page) anchor is absent on the last page; an
        # explicit None check replaces the original bare ``except`` that
        # could also mask unrelated bugs.
        next_link = soup.find('a', title='下一页')
        if next_link is None:
            break
        a_url = station_url + next_link['href']
        time.sleep(0.5)
    return url_list

def main():
    """Entry point: walk 67 list pages and download every gallery found."""
    site_root = 'https://www.501ii.com'
    first_page = 'https://www.501ii.com/tupian/list-%E7%BE%8E%E8%85%BF%E4%B8%9D%E8%A2%9C-6.html'
    send_header = {
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko)'}
    gallery_urls = down_list(station_url=site_root, list_url_pic=first_page,
                             header=send_header, pages=67)
    for page, gallery in enumerate(gallery_urls, start=1):
        down_pic(url=gallery, header=send_header)
        print('第' + str(page) + '页爬取完毕')
    print('全部爬取完毕')

# Removed the commented-out scratch/debug code that duplicated main()'s setup.
if __name__ == '__main__':
    main()